# initialize log file
my_log <- file("log.txt")
sink(my_log, append = TRUE, type = "output")
sink(my_log, append = TRUE, type = "message")
cat(readChar(rstudioapi::getSourceEditorContext()$path,
             file.info(rstudioapi::getSourceEditorContext()$path)$size))

# Replication code for
# "The Development of the Urban-Rural Cleavage 
# in Anglo-American Democracies"
#
# Comparative Political Studies
#
# Zack Taylor, Western University
# Jack Lucas, University of Calgary
# David A. Armstrong II, Western University
# Ryan Bakker, University of Essex
#
# July 14, 2023

library(tidyverse)
library(rio)
library(nnet)
library(ggridges)
library(broom)
#library(ggpubr)
library(patchwork)
library(ggrepel)
library(compositions)
library(progress)
library(abind)

# Import and combine data ----

can <- rio::import("can.dta") %>% 
  filter(win==1) %>% 
  dplyr::select(winner = ptygrp1, eyear, logdensity, case, region) %>% 
  filter(eyear>=1920)

usa <- rio::import("usa_cq.dta") %>% 
  filter(win==1) %>% 
  dplyr::select(winner = ptygrp2, eyear, logdensity, case, region)

gbr <- rio::import("gbr.dta") %>% 
  filter(win==1) %>% 
  dplyr::select(winner = ptygrp1, eyear, logdensity, case, region = division) %>%
  filter(eyear>=1928) %>%
  mutate(region = ifelse(region == "Eastern", "East", region))

df <- rbind(can, usa, gbr) %>% 
  mutate(country_elec = paste(case, eyear)) %>%
  filter(!is.na(logdensity))

df <- df %>%
  mutate(case = case_when(
    case == "CAN" ~ "Canada", 
    case == "USA" ~ "United States", 
    case == "GBR" ~ "Britain", 
    TRUE ~ NA_character_), 
    case = factor(case, levels=c("Canada", "Britain", "United States")))

# Figure 1: Distributions of Log-Transformed Population Density by Election ----

ggplot(df, aes(x=logdensity, y=eyear, group=eyear, fill = factor(stat(quantile)))) + 
  stat_density_ridges(
    geom = "density_ridges_gradient", calc_ecdf = TRUE,
    quantiles = 4, quantile_lines = TRUE) +
  scale_fill_manual(values = c("#bcbddc", "#756bb1", "#756bb1", "#bcbddc")) + 
  facet_wrap(~case, ncol = 3) + 
  xlab("") + 
  ylab("Election Year") + 
  theme_minimal() + 
  theme(panel.border = element_rect(fill=NA, size=0.5), 
        legend.position="none", 
        axis.text.y = element_text(size=6), 
        axis.text.x = element_text(size=6, angle=60, hjust=1),
        axis.title.y = element_text(size=8),
        title = element_text(size=8), 
        strip.text = element_text(size=8)) + 
  scale_x_continuous(limits=c(-6,11), breaks=c(-4.60517, 0, 4.60517, 9.21034), labels=c(0.01, 1, 100, 10000)) + 
  scale_y_continuous(breaks=seq(1900,2022,2), expand=c(0,0), limits=c(1920, 2024))  

scale = 1.25
ggsave("Figure 1_Distributions of Log-Transformed Population Density by Election.png",
       width = 4.25 / scale, height = 6 / scale, dpi = 600, scale = scale, bg = "white")

# Figure 2: Estimated Proportional Reduction in Error due to Urbanity ----

sp <- split(df, df$country_elec)
names(sp) <- sapply(sp, function(x)x$country_elec[1])

sp <- lapply(sp, function(x){
  x$winner <- droplevels(factor(x$winner))
  x
})
instances <- unique(df$country_elec)
df$case <- factor(df$case, labels=c("Canada", "Britain", "United States"))

pb <- progress_bar$new(total= length(sp))

# epre
epre.results <- NULL
for(i in instances){
  tmp <- df %>% filter(country_elec==i)
  tmp <- tmp %>% 
    group_by(winner) %>% 
    mutate(n = n()) %>% 
    filter(n >= 15)
  winning.parties <- length(unique(tmp$winner))
  if(winning.parties>2){# this is the multinomial code for elections with more than two winning parties
    # null model and density model
    null <- multinom(winner ~ region, data=tmp %>% filter(country_elec==i), trace=F)
    urb <- multinom(winner ~ logdensity + region, data=tmp %>% filter(country_elec==i), trace=F)
    # calculate necessary ingredients for epre
    y <- model.frame(urb)[,1]
    py <- table(y)/sum(table(y))
    pmc <- mean(py[match(y, names(py))])
    pred1 <- null$fitted.values
    pred2 <- urb$fitted.values
    p1 <- pred1[cbind(1:nrow(pred1), match(y, colnames(pred1)))]
    p2 <- pred2[cbind(1:nrow(pred2), match(y, colnames(pred2)))]
    epre <- data.frame(epre = (mean(p2) - mean(p1)) / (1-mean(p1)), election = i, type = "multinom")
    epre.results <- rbind(epre.results, epre)}
  else{# now the code for cases with two parties
    # first, create a binary variable where 0 = one of the parties and 1 = the other party
    winning.parties <- unique(tmp$winner)
    tmp$winner_binary <- NA
    tmp$winner_binary[tmp$winner==winning.parties[1]] <- 0
    tmp$winner_binary[tmp$winner==winning.parties[2]] <- 1
    # now run the model
    null <- glm(winner_binary ~ region, data=tmp, family=binomial(link="logit"))
    urb <- glm(winner_binary ~ logdensity + region, data=tmp, family=binomial(link="logit"))
    # extract the observed outcomes and calculate the probability that y=1
    y <- model.frame(urb)[,1]
    py <- mean(y)
    # extract the probabilities 
    pred1 <- null$fitted.values
    pred2 <- urb$fitted.values
    # calculate epcp and epmc 
    epcp <- sum(ifelse(y==1,pred2,(1-pred2)))/length(pred2)
    epmc <- sum(ifelse(y==1,pred1,(1-pred1)))/length(pred1)
    # calculate epre
    epre <- data.frame(epre = (epcp - epmc) / (1-epmc), election = i, type = "logit")
    epre.results <- rbind(epre.results, epre)
  }
  pb$tick()
}
epre.results <- epre.results %>% mutate(country = substr(election,1,3), year = as.numeric(substr(election,5,9)), 
                                        case = case_when(country == "CAN" ~ "Canada", 
                                                         country == "GBR" ~ "Britain", 
                                                         country == "USA" ~ "United States"), 
                                        case = factor(case, levels=c("Canada", "Britain", "United States")))

ggplot(epre.results, aes(x=year, y=epre)) + 
  geom_point(size = .25, shape = 21, fill = "black") +
  geom_line() + 
  scale_y_continuous(label=scales::percent) + 
  scale_x_continuous(breaks=seq(1900, 2020, by = 10)) +
  facet_wrap(~case) + 
  theme_minimal() + 
  theme(axis.text.x=element_text(angle = -90, hjust = -2),
        axis.text = element_text(size=6), 
        axis.title=element_text(size=8), 
        strip.text = element_text(size=8), 
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.border = element_rect(fill=NA, size=0.5)) +
  labs(x="Election Year", y= "Expected Proportional Reduction in Error")+
  geom_hline(yintercept=0, linetype=3) 

scale = 1.25
ggsave("Figure 2_Estimated Proportional Reduction in Error due to Urbanity.png",
       width = 4.25 / scale, height = 3 / scale, dpi = 600, scale = scale, bg = "white")

# Figure 3: The Ten Highest ePRE Scores by Country ----

topten <- epre.results %>% 
  group_by(country) %>% 
  slice_max(epre, n=10) %>%
  mutate(rank = rank(-epre))
ggplot(topten, aes(x=year, y=country, fill=rank)) + 
  geom_label(aes(label=rank), size=1) + 
  xlab("Election Year\n") + ylab("") + 
  scale_x_continuous(breaks=seq(1920,2025,10)) + 
  scale_fill_gradient(low = "#08519c", high = "#eff3ff") + 
  theme_minimal() + 
  theme(panel.border = element_rect(fill=NA, size=0.5),
        axis.text.x = element_text(angle=-90, size=7),
        axis.text = element_text(size=6), 
        axis.title = element_text(size=8),
        legend.position = "none")
# + ggtitle("Ten Highest EPRE Scores, by Country")

scale = 1.25
ggsave("Figure 3_The Ten Highest ePRE Scores by Country.png",
       width = 4.25 / scale, height = 1.95 / scale, dpi = 600, scale = scale, bg = "white")

# Figure 4: Expected Shift in Party Vote Share, Lowest Density to Highest Density District ----
# Also creates diagnostics that appear in the appendix

`%x%` <- function(X,Y)paste(ifelse(is.na(Y),"", X), collapse="")
inl <- function(x, y){
  sapply(y, \(yy)x %in% yy)
}

boot_cr <- function(obj, d1, d2, cn = NULL, R=2500, ...){
  require(compositions)
  require(abind)
  ncb <- ncol(coef(obj))
  if(is.null(ncb))ncb <- 1
  b <- c(coef(obj))
  V <- vcov(obj)
  B <- MASS::mvrnorm(R, b, V)
  b_mats <- lapply(1:nrow(B), \(i)matrix(B[i,], ncol=ncb))
  X1 <- model.matrix(obj, data=d1)
  X2 <- model.matrix(obj, data=d2)
  xb1 <- lapply(b_mats, \(b) apply(alrInv(X1 %*% b), 2, as.numeric))
  xb2 <- lapply(b_mats, \(b) apply(alrInv(X2 %*% b), 2, as.numeric))
  xb1$along <- xb2$along <- 3
  xba1 <- do.call(abind, xb1)
  xba2 <- do.call(abind, xb2)
  xb_diff <- xba2-xba1  
  attr(xb_diff, "parties") <- cn
  xb_diff
}


## Canada ----

can <- rio::import("can.dta") %>% 
  dplyr::select(distnameflat,logdensity, case, eyear, pty, pct, win, region, ptygrp1, ptygrp2) %>%
  # mutate(ptygrp1 = ifelse(ptygrp1 %in% c("CCF_NDP", "Liberal", "Conservative"), ptygrp1, "Other")) %>% 
  #   filter(ptygrp1 == "CCF_NDP" | ptygrp1 == "Liberal" | ptygrp1 == "Conservative") %>%
  #  rename(ptygrp1 = ptygrp2, ptygrp2 = ptygrp1) %>% 
  filter(region!="The North") %>% filter(eyear>=1920)

## Total district-years
can %>% group_by(distnameflat, eyear) %>% slice_head(n=1) %>% nrow()

## Total competitive district-years
can_comp_ridings <- can %>% 
  group_by(distnameflat, eyear) %>% 
  filter(n() > 1) %>% 
  slice_head(n=1) %>% 
  nrow()
can_comp_ridings


can <- can %>% 
  group_by(ptygrp1) %>% 
  mutate(abb1 = paste0("p", cur_group_id()))%>%
  group_by(distnameflat, eyear) %>% 
  mutate(prof1 = paste(sort(unique(abb1)), collapse="_"))
can_ag <- can %>% 
  group_by(distnameflat, eyear) %>% 
  reframe(prof1 = paste(sort(unique(abb1)), collapse="_"))
can_ag1 <- can_ag %>% group_by(eyear, prof1) %>% tally()
incl_prof <- can_ag1 %>% ungroup %>% filter(n>= 10) %>% select(prof1) %>% pull() %>% unique()

n <- can %>% 
  filter(prof1 %in% incl_prof) %>% 
  group_by(eyear, distnameflat, prof1) %>% 
  slice_head(n=1) %>% 
  group_by(eyear, prof1) %>% 
  tally() %>% 
  ungroup %>% 
  select(n) %>% 
  pull
## total included districts
sum(n[which(n >= 10)])
## Proportion of total competitive districts
sum(n[which(n >= 10)])/can_comp_ridings

tmp_can <- can %>% 
  group_by(distnameflat, case, eyear, logdensity, region, ptygrp1) %>% 
  summarise(pct = sum(pct)) %>% 
  pivot_wider(id_cols = c("distnameflat", "case", "eyear", "logdensity", "region"), values_from="pct", names_from="ptygrp1", names_prefix = "pct_") 


prof <- tmp_can %>% 
  ungroup %>% 
  select(starts_with("pct_"))
tx <- c("F", "C", "L", "T", "I", "B", "R", "P")

ptymap <- data.frame(prof = tx, party = c("CCF-NDP", "Conservative", "Liberal", "Third", "Independent", "Bloc", "Reform-Alliance", "Progressive"))

tmp_can$profile <- apply(prof, 1, \(x)tx %x% x)
tmp_can <- tmp_can %>%
  group_by(eyear, profile) %>% 
  mutate(n = n()) %>%
  ungroup %>% 
  mutate(incl_n = n >= 10, 
         incl_comp = nchar(profile) > 1, 
         incl = incl_n & incl_comp)

## Districts excluded by year
tmp_can %>% group_by(eyear) %>% summarise(excl = sum(!incl)) %>% 
  mutate(cs = cumsum(excl), 
         cp = cs/sum(excl)) %>% 
  as.data.frame()

## Districts excluded by region
tmp_can %>% group_by(region) %>% summarise(excl = sum(!incl)) %>% 
  mutate(p = excl/sum(excl)) %>% 
  as.data.frame()

### Figure B1: Canada, Density by inclusion/exclusion boxplot ----
tmp_can %>% 
  mutate(included = ifelse(incl, "Included", "Excluded")) %>% 
  ggplot(aes(x=included, y=logdensity)) + 
  geom_boxplot() + 
  theme_classic() + 
  labs(x="", y="log(Density)")
ggsave("Figure B1_Canada.png", 
       height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

### Figure B2: Canada, District-Years in Which Parties are Included/Excluded ----
excl_pty <- tmp_can %>% 
  filter(!incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_excl = sum(n))

incl_pty <- tmp_can %>% 
  filter(incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_incl = sum(n))
full_join(incl_pty, excl_pty) %>% left_join(ptymap) %>% 
  select(party, n_incl, n_excl) %>% 
  rename(Included = n_incl, Excluded=n_excl) %>% 
  mutate(Included_pct = round((Included/(Included + Excluded))*100), 
         Excluded_pct = round((Excluded/(Included + Excluded))*100)) %>%
  knitr::kable()

tmp_can <- tmp_can %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)

unyrs <- sort(unique(tmp_can$eyear))
res <- list()
k <- 1
pb <- progress_bar$new(total = length(unyrs))


for(i in 1:length(unyrs)){
  pb$tick()
  tmp <- tmp_can %>% 
    filter(eyear == unyrs[i])
  
  profs <- table(tmp$profile)
  for(j in 1:length(profs)){
    tmpj <- tmp %>% filter(profile == names(profs)[j])
    comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
    comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]
    mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity +region, data=tmpj))
    if(inherits(mod, "try-error")){
      mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj))
    }
    d1 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = min(logdensity, na.rm=TRUE))
    d2 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = max(logdensity, na.rm=TRUE))
    diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
    res[[k]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1]), 
                     diff = diff)
    k <- k+1
    
    
  }
}

id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- split(id_dat, id_dat$year)
allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party %in% c("CCF", "NDP") ~ "NDP", 
           party == "Liberal" ~ "Lib", 
           party == "Conservative" ~ "Con", 
           TRUE ~ NA_character_), 
         party = factor(party, levels=c("NDP", "Lib", "Con"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))
output <- allout

### By region

tmp_canr <- tmp_can %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear, region) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)

can_regs <- tmp_canr %>% 
  group_by(eyear, region, profile) %>% 
  select(eyear, region, profile) %>% 
  slice_head(n=1)

res <- list()
pb <- progress_bar$new(total = nrow(can_regs))

for(i in 1:nrow(can_regs)){
  pb$tick()
  tmpj <- tmp_canr %>% 
    filter(eyear == can_regs$eyear[i] & region == can_regs$region[i] & profile == can_regs$profile[i])
  comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
  comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]
  mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj))
  d1 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = min(logdensity, na.rm=TRUE))
  d2 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = max(logdensity, na.rm=TRUE))
  diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
  res[[i]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1], profile=tmpj$profile[1], region = tmpj$region[1]), 
                   diff = diff)
}

id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- id_dat %>% 
  group_by(year, region) %>% 
  group_split()

allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year, region, profile)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party %in% c("CCF", "NDP") ~ "NDP", 
           party == "Liberal" ~ "Lib", 
           party == "Conservative" ~ "Con", 
           TRUE ~ NA_character_), 
         party = factor(party, levels=c("NDP", "Lib", "Con"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))

output_reg <- allout


## Great Britain ----

gbr <- rio::import("gbr.dta") %>% 
  dplyr::select(distnameflat,logdensity, case, eyear, pty, pct, win, region = division, ptygrp1, ptygrp2, region2 = region) %>%
  #  filter(ptygrp1 == "conservative" | ptygrp1 == "labour" | ptygrp1 == "liberal") %>% 
  filter(eyear>=1928) %>%
  mutate(pct = ifelse(pct == 0, NA, pct)) %>% 
  mutate(region = ifelse(region == "Eastern", "East", region))

gbr %>% group_by(distnameflat, eyear) %>% slice_head(n=1) %>% nrow()

## Total competitive district-years
gbr_comp_ridings <- gbr %>% 
  group_by(distnameflat, eyear) %>% 
  filter(n() > 1) %>% 
  slice_head(n=1) %>% 
  nrow()
gbr_comp_ridings


gbr <- gbr %>% 
  group_by(ptygrp1) %>% 
  mutate(abb1 = paste0("p", cur_group_id()))%>%
  group_by(distnameflat, eyear) %>% 
  mutate(prof1 = paste(sort(unique(abb1)), collapse="_"))
gbr_ag <- gbr %>% 
  group_by(distnameflat, eyear) %>% 
  reframe(prof1 = paste(sort(unique(abb1)), collapse="_"))
gbr_ag1 <- gbr_ag %>% group_by(eyear, prof1) %>% tally()
incl_prof <- gbr_ag1 %>% ungroup %>% filter(n>= 10) %>% select(prof1) %>% pull() %>% unique()

n <- gbr %>% 
  filter(prof1 %in% incl_prof) %>% 
  group_by(eyear, distnameflat, prof1) %>% 
  slice_head(n=1) %>% 
  group_by(eyear, prof1) %>% 
  tally() %>% 
  ungroup %>% 
  select(n) %>% 
  pull
## total included districts
sum(n[which(n >= 10)])
## Proportion of total competitive districts
sum(n[which(n >= 10)])/gbr_comp_ridings


tmp_gbr <- gbr %>% 
  group_by(distnameflat, case, eyear, logdensity, region, ptygrp1) %>% 
  summarise(pct = sum(pct)) %>% 
  pivot_wider(id_cols = c("distnameflat", "case", "eyear", "logdensity", "region"), values_from="pct", names_from="ptygrp1", names_prefix = "pct_") %>%
  select(-"pct_")

prof <- tmp_gbr %>% 
  ungroup %>% 
  select(starts_with("pct_"))
tx <- c("C", "B", "L", "W", "O", "S", "E", "N")


ptymap <- data.frame(prof = tx, party = c("Conservative", "Labour", "Liberal", "Welsh Nationalist", "Other", "Scottish Nationalist", "English Nationalist", "Nationalist"))

tmp_gbr$profile <- apply(prof, 1, \(x)tx %x% x)
tmp_gbr <- tmp_gbr %>%
  group_by(eyear, profile) %>% 
  mutate(n = n()) %>%
  ungroup %>% 
  mutate(incl_n = n >= 10, 
         incl_comp = nchar(profile) > 1, 
         incl = incl_n & incl_comp)

## Excluded districts by Year
tmp_gbr %>% group_by(eyear) %>% summarise(excl = sum(!incl)) %>% 
  mutate(cs = cumsum(excl), 
         cp = cs/sum(excl)) %>% 
  as.data.frame()

## Excluded districts by Region
tmp_gbr %>% group_by(region) %>% summarise(excl = sum(!incl)) %>% 
  mutate(p = excl/sum(excl)) %>% 
  as.data.frame()

### Figure B3: Great Britain, Density by Inclusion/Exclusion Boxplot ----
tmp_gbr %>% 
  mutate(included = ifelse(incl, "Included", "Excluded")) %>% 
  ggplot(aes(x=included, y=logdensity)) + 
  geom_boxplot() + 
  theme_classic() + 
  labs(x="", y="log(Density)")
ggsave("Figure B3_Great Britain.png", 
       height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

### Figure B4: Great Britain, District-Years in Which Parties are Included/Excluded ----
excl_pty <- tmp_gbr %>% 
  filter(!incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_excl = sum(n))

incl_pty <- tmp_gbr %>% 
  filter(incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_incl = sum(n))
full_join(incl_pty, excl_pty) %>% left_join(ptymap) %>% 
  select(party, n_incl, n_excl) %>% 
  rename(Included = n_incl, Excluded=n_excl) %>% 
  mutate(Included_pct = round((Included/(Included + Excluded))*100), 
         Excluded_pct = round((Excluded/(Included + Excluded))*100)) %>%
  knitr::kable()



tmp_gbr <- tmp_gbr %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)


unyrs <- sort(unique(tmp_gbr$eyear))
res <- list()
k <- 1
pb <- progress_bar$new(total = length(unyrs))


for(i in 1:length(unyrs)){
  pb$tick()
  tmp <- tmp_gbr %>% 
    filter(eyear == unyrs[i]) 
  
  profs <- table(tmp$profile)
  library(progress)
  for(j in 1:length(profs)){
    tmpj <- tmp %>% filter(profile == names(profs)[j])
    comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
    if(any(apply(comp, 2, \(x)all(is.na(x))))){
      comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]  
    }
    mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity +region, data=tmpj))
    if(inherits(mod, "try-error")){
      mod <- lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj)
    }
    d1 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = min(logdensity))
    d2 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = max(logdensity))
    diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
    res[[k]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1]), 
                     diff = diff)
    k <- k+1
    
    
  }
}

id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- split(id_dat, id_dat$year)
allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party == "conservative" ~ "Con", 
           party == "liberal" ~ "Lib", 
           party == "labour" ~ "Lab"), 
         party = factor(party, levels=c("Lab", "Lib", "Con"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))
output <- bind_rows(output, allout)


### By region

tmp_gbrr <- tmp_gbr %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear, region) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)

gbr_yrs <- tmp_gbr %>% 
  group_by(region, eyear) %>% 
  tally() %>% 
  rename(overall_n = n) %>% 
  left_join(
    tmp_gbrr %>% 
      group_by(region, eyear) %>% 
      tally() %>% 
      rename(region_n = n)
  )

# Diagnostic plot: Compares number of parties included in regional versus national analysis for GBR
# gbr_yrs %>% 
#   pivot_longer(-(1:2), names_to="analysis", values_to = "n") %>% 
#   ggplot(aes(x=as.factor(eyear), y=n, fill=analysis)) + 
#   geom_bar(stat="identity", position="dodge") + 
#   theme_classic() + 
#   theme(legend.position="top", 
#         axis.text.x = element_text(angle=90, hjust=1)) + 
#   facet_wrap(~region, scales="free_y")
# ggsave("gbr_yr_region_comparison2.png", height=6, width=9, dpi=300)

gbr_regs <- tmp_gbrr %>% 
  group_by(eyear, region, profile) %>% 
  select(eyear, region, profile) %>% 
  slice_head(n=1)

res <- list()
pb <- progress_bar$new(total = nrow(gbr_regs))

for(i in 1:nrow(gbr_regs)){
  pb$tick()
  tmpj <- tmp_gbrr %>% 
    filter(eyear == gbr_regs$eyear[i] & region == gbr_regs$region[i] & profile == gbr_regs$profile[i])
  comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
  comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]
  mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj))
  d1 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = min(logdensity, na.rm=TRUE))
  d2 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = max(logdensity, na.rm=TRUE))
  diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
  res[[i]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1], profile=tmpj$profile[1], region = tmpj$region[1]), 
                   diff = diff)
}

id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- id_dat %>% 
  group_by(year, region) %>% 
  group_split()
allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year, region, profile)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party == "conservative" ~ "Con", 
           party == "liberal" ~ "Lib", 
           party == "labour" ~ "Lab"), 
         party = factor(party, levels=c("Lab", "Lib", "Con"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))

output_reg <- bind_rows(output_reg, allout)


## USA ----

usa <- rio::import("usa_cq.dta") %>% 
  dplyr::select(distnameflat,logdensity, case, eyear, pty, pct, win, region, ptygrp1, ptygrp2) %>%
  mutate(pct = ifelse(pct == 0, NA, pct))
#  filter(ptygrp1 == "Democrat" | ptygrp1 == "Republican")

usa %>% group_by(distnameflat, eyear) %>% slice_head(n=1) %>% nrow()

## Total competitive district-years
usa_comp_ridings <- usa %>% 
  group_by(distnameflat, eyear) %>% 
  filter(n() > 1) %>% 
  slice_head(n=1) %>% 
  nrow()
usa_comp_ridings


usa <- usa %>% 
  group_by(ptygrp2) %>% 
  mutate(abb1 = paste0("p", cur_group_id()))%>%
  group_by(distnameflat, eyear) %>% 
  mutate(prof1 = paste(sort(unique(abb1)), collapse="_"))
usa_ag <- usa %>% 
  group_by(distnameflat, eyear) %>% 
  reframe(prof1 = paste(sort(unique(abb1)), collapse="_"))
usa_ag1 <- usa_ag %>% group_by(eyear, prof1) %>% tally()
incl_prof <- usa_ag1 %>% ungroup %>% filter(n>= 10 & nchar(prof1) > 2) %>% select(prof1) %>% pull() %>% unique()

n <- usa %>% 
  filter(prof1 %in% incl_prof) %>% 
  group_by(eyear, distnameflat, prof1) %>% 
  slice_head(n=1) %>% 
  group_by(eyear, prof1) %>% 
  tally() %>% 
  ungroup %>% 
  select(n) %>% 
  pull
## total included districts
sum(n[which(n >= 10)])
## Proportion of total competitive districts
sum(n[which(n >= 10)])/usa_comp_ridings


tmp_usa <- usa %>% 
  group_by(distnameflat, case, eyear, logdensity, region, ptygrp2) %>% 
  summarise(pct = sum(pct)) %>% 
  pivot_wider(id_cols = c("distnameflat", "case", "eyear", "logdensity", "region"), values_from="pct", names_from="ptygrp2", names_prefix = "pct_") %>% 
  select(-"pct_")


prof <- tmp_usa %>% 
  ungroup %>% 
  select(starts_with("pct_"))
tx <- c("D", "M", "R")
ptymap <- data.frame(prof = tx, party = c("Democrat", "Minor", "Republican"))

tmp_usa$profile <- apply(prof, 1, \(x)tx %x% x)
tmp_usa <- tmp_usa %>%
  group_by(eyear, profile) %>% 
  mutate(n = n()) %>%
  ungroup %>% 
  mutate(incl_n = n >= 10, 
         incl_comp = nchar(profile) > 1, 
         incl = incl_n & incl_comp)

## Exclusion by Year
tmp_usa %>% group_by(eyear) %>% summarise(excl = sum(!incl)) %>% 
  mutate(cs = cumsum(excl), 
         cp = cs/sum(excl)) %>% 
  as.data.frame()

## Exclusion by Region
tmp_usa %>% group_by(region) %>% summarise(excl = sum(!incl)) %>% 
  mutate(p = excl/sum(excl)) %>% 
  as.data.frame()

### Figure B5: USA, Density by Inclusion/Exclusion Boxplot ----

tmp_usa %>% 
  mutate(included = ifelse(incl, "Included", "Excluded")) %>% 
  ggplot(aes(x=included, y=logdensity)) + 
  geom_boxplot() + 
  theme_classic() + 
  labs(x="", y="log(Density)")
ggsave("Figure B5_USA.png", 
       height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

### Figure B6: USA, District-Years in Which Parties are Included/Excluded ----

excl_pty <- tmp_usa %>% 
  filter(!incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_excl = sum(n))

incl_pty <- tmp_usa %>% 
  filter(incl) %>% 
  select(profile) %>% 
  pull %>% 
  table %>% 
  as_tibble() %>% 
  setNames(c("profile", "n")) %>% 
  mutate(prof = str_split(profile, "")) %>% 
  unnest(prof) %>% 
  group_by(prof) %>% 
  summarise(n_incl = sum(n))
full_join(incl_pty, excl_pty) %>% left_join(ptymap) %>% 
  select(party, n_incl, n_excl) %>% 
  rename(Included = n_incl, Excluded=n_excl) %>% 
  mutate(Included_pct = round((Included/(Included + Excluded))*100), 
         Excluded_pct = round((Excluded/(Included + Excluded))*100)) %>%
  knitr::kable()

tmp_usa <- tmp_usa %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)


unyrs <- sort(unique(tmp_usa$eyear))
res <- list()
k <- 1
pb <- progress_bar$new(total = length(unyrs))


for(i in 1:length(unyrs)){
  pb$tick()
  tmp <- tmp_usa %>% 
    filter(eyear == unyrs[i]) 
  
  profs <- table(tmp$profile)
  library(progress)
  for(j in 1:length(profs)){
    tmpj <- tmp %>% filter(profile == names(profs)[j])
    comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
    if(any(apply(comp, 2, \(x)all(is.na(x))))){
      comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]  
    }
    mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity +region, data=tmpj))
    if(inherits(mod, "try-error")){
      mod <- lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj)
    }
    d1 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = min(logdensity))
    d2 <- tmpj %>% 
      group_by(region) %>% 
      mutate(logdensity = max(logdensity))
    diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
    res[[k]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1]), 
                     diff = diff)
    k <- k+1
    
    
  }
}
id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- split(id_dat, id_dat$year)
allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party == "Democrat" ~ "Dem", 
           party == "Republican" ~ "Rep"), 
         party = factor(party, levels=c("Dem", "Rep"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))

output <- bind_rows(output, allout)

tmp_usar <- tmp_usa %>% 
  ungroup %>% 
  filter(nchar(profile) > 1) %>% 
  group_by(profile, eyear, region) %>% 
  mutate(n=n()) %>% 
  filter(n >= 10)

usa_regs <- tmp_usar %>% 
  group_by(eyear, region, profile) %>% 
  select(eyear, region, profile) %>% 
  slice_head(n=1)

res <- list()
library(progress)
pb <- progress_bar$new(total = nrow(usa_regs))

for(i in 1:nrow(usa_regs)){
  pb$tick()
  tmpj <- tmp_usar %>% 
    filter(eyear == usa_regs$eyear[i] & region == usa_regs$region[i] & profile == usa_regs$profile[i])
  comp <- tmpj %>% ungroup %>% select(starts_with("pct_")) %>% as.matrix()
  if(any(apply(comp, 2, \(x)all(is.na(x))))){
    comp <- comp[,-which(apply(comp, 2, \(x)all(is.na(x)))), drop=FALSE]
  }
  mod <- try(lm(apply(alr(aplus(comp)), 2, as.numeric) ~ logdensity, data=tmpj))
  d1 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = min(logdensity, na.rm=TRUE))
  d2 <- tmpj %>% 
    group_by(region) %>% 
    mutate(logdensity = max(logdensity, na.rm=TRUE))
  diff <- boot_cr(mod, d1, d2, cn=gsub("pct_", "", colnames(comp)))
  res[[i]] <- list(id = data.frame(n = nrow(tmpj), case = tmpj$case[1], year = tmpj$eyear[1], profile=tmpj$profile[1], region = tmpj$region[1]), 
                   diff = diff)
}

id_dat <- lapply(res, \(x)x$id) %>% 
  bind_rows() %>% 
  mutate(obs = 1:n())

sp <- id_dat %>% 
  group_by(year, region) %>% 
  group_split()
allout <- NULL
for(i in 1:length(sp)){
  if(nrow(sp[[i]]) == 1){
    zz <- apply(res[[sp[[i]]$obs]]$diff, c(2,3), mean)
    m <- rowMeans(zz)
    l <- apply(zz, 1, quantile, .025)
    u <- apply(zz, 1, quantile, .975)
    out1 <- out2 <- rbind(m, l, u)
    rownames(out1) <- c("mean1", "lower1", "upper1")
    rownames(out2) <- c("mean2", "lower2", "upper2")
    colnames(out1) <- colnames(out2) <- attr(res[[sp[[i]]$obs]]$diff, "parties")
    out <- bind_cols(t(out1), t(out2)) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = colnames(out1)) %>% 
      bind_cols(sp[[i]]) %>% 
      select(case, year, party, everything()) %>% 
      select(-n, -obs)
  }else{
    parties <- lapply(sp[[i]]$obs, \(i)attr(res[[i]]$diff, "parties"))
    diffs <- lapply(sp[[i]]$obs, \(i)res[[i]]$diff)
    unparties <- unique(c(unlist(parties)))
    inparts <- sapply(unparties, \(z)as.numeric(inl(z, parties)))
    inparts2 <- inparts
    inparts2[which(inparts2 == 0, arr.ind=TRUE)] <- 1
    n <- sp[[i]]$n
    pct1 <- apply(inparts, 2, \(z)z*n)
    pct2 <- apply(inparts2, 2, \(z)z*n)
    pct1 <- prop.table(pct1, 2)
    pct2 <- prop.table(pct2, 2)
    eff1 <- eff2 <- NULL
    for(p in unparties){
      laps <- lapply(parties, \(x){w <- which(x == p)})
      laps <- sapply(laps, \(x)ifelse(length(x) > 0, x, 0))
      d <- lapply(1:length(laps), \(k){
        if(laps[k] != 0){
          diffs[[k]][,laps[k],]
        }else{
          array(0, dim = dim(diffs[[k]])[c(1,3)])
        }
      })
      effs <- sapply(d, colMeans)
      eff1 <- cbind(eff1, effs %*% pct1[,p])
      eff2 <- cbind(eff2, effs %*% pct2[,p])
    }
    m1 <- colMeans(eff1)
    m2 <- colMeans(eff2)
    l1 <- apply(eff1, 2, quantile, .025)
    l2 <- apply(eff2, 2, quantile, .025)
    u1 <- apply(eff1, 2, quantile, .975)
    u2 <- apply(eff2, 2, quantile, .975)
    out <- data.frame(mean1 = m1, lower1 = l1, upper1 = u1, 
                      mean2 = m2, lower2 = l2, upper2 = u2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      mutate(party = unparties) %>% 
      cbind(sp[[i]] %>% slice(1) %>% select(case, year, region, profile)) %>% 
      select(case, year, party, everything()) 
  }
  allout <- rbind(allout, out)
}

allout <- allout %>% 
  mutate(orig_party = party, 
         party = case_when(
           party == "Democrat" ~ "Dem", 
           party == "Republican" ~ "Rep"), 
         party = factor(party, levels=c("Dem", "Rep"))) %>% 
  mutate(label = if_else(year == max(year), as.character(party), NA_character_))

output_reg <- bind_rows(output_reg, allout)

vs <- output
reg <- output_reg

## Create Figure 4 ----

can.elec <- can.elec.labs <- vs %>% filter(case=="CAN") %>% group_by(year) %>% slice(1) %>% pull(year)
can.elec.labs[c(3,11,13,19,26)] <- ""
usa.elec <- vs %>% filter(case=="USA") %>% group_by(year) %>% slice(1) %>% pull(year)
gbr.elec <- gbr.elec.labs <- vs %>% filter(case=="GBR") %>% mutate(year = round(year, digits=0)) %>% group_by(year) %>% slice(1) %>% pull(year)
gbr.elec.labs[c(2,6,10,22)] <- ""


vs.can <- vs %>% filter(orig_party %in% c("CCF_NDP", "Conservative", "Liberal")) %>%
  filter(case=="CAN") %>% 
  mutate(party = car::recode(orig_party, "'Conservative' = 'Con'; 'CCF_NDP' = 'NDP'; 'Liberal' = 'Lib'")) %>%
  mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
  ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
  geom_hline(yintercept=0, linetype="dotted") + 
  geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
  scale_x_continuous(limits=c(1920,2030), breaks=can.elec, labels=can.elec.labs) + 
  scale_y_continuous(labels=scales::percent, limits=c(-0.5, 0.5), breaks=seq(-0.5,0.5,0.25)) + 
  theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
                          axis.text.x = element_text(angle=90, size=7),
                          axis.text.y = element_text(size=7), 
                          panel.grid.major.y = element_blank(),
                          panel.grid.minor.y = element_blank(),
                          panel.grid.minor.x = element_blank(),
                          title=element_text(size=8),
                          legend.position = "none") +
  xlab("") + ylab("") + 
  scale_fill_manual(values=c("blue", "red", "orange")) + 
  scale_color_manual(values=c("blue", "red", "orange")) + 
  ggtitle("Canada") + 
  geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)

vs.usa <- vs %>% filter(orig_party=="Republican" | orig_party=="Democrat") %>%
  filter(case=="USA") %>% 
  mutate(party = car::recode(orig_party, "'Republican' = 'Rep'; 'Democrat' = 'Dem'")) %>%
  mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
  ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
  geom_hline(yintercept=0, linetype="dotted") + 
  geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
  scale_x_continuous(limits=c(1920,2030), breaks=seq(1944, 2020, 4)) + 
  scale_y_continuous(labels=scales::percent, limits=c(-0.5, 0.5), breaks=seq(-0.5,0.5,0.25)) + 
  theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
                          axis.text.x = element_text(angle=90, size=7),
                          axis.text.y = element_text(size=7), 
                          panel.grid.major.y = element_blank(),
                          panel.grid.minor.y = element_blank(),
                          panel.grid.minor.x = element_blank(),
                          title=element_text(size=8),
                          legend.position = "none") +
  xlab("") + ylab("") + 
  scale_fill_manual(values=c("blue", "red")) + 
  scale_color_manual(values=c("blue", "red")) + 
  ggtitle("United States") + 
  geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)

vs.gbr <- vs %>% filter(orig_party=="labour" | orig_party=="liberal" | orig_party=="conservative") %>%
  filter(case=="GBR") %>% mutate(year = round(year, digits=0)) %>%
  mutate(party = car::recode(orig_party, "'liberal' = 'Lib'; 'conservative' = 'Con'; 'labour' = 'Lab'")) %>%
  mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
  ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
  geom_hline(yintercept=0, linetype="dotted") + 
  geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
  scale_x_continuous(limits=c(1920,2030), breaks=gbr.elec, labels=gbr.elec.labs) + 
  scale_y_continuous(labels=scales::percent, limits=c(-0.5, 0.5), breaks=seq(-0.5,0.5,0.25)) + 
  theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
                          axis.text.x = element_text(angle=90, size=7),
                          axis.text.y = element_text(size=7), 
                          panel.grid.major.y = element_blank(),
                          panel.grid.minor.y = element_blank(),
                          panel.grid.minor.x = element_blank(),
                          title=element_text(size=8),
                          legend.position = "none") +
  xlab("") + ylab("") + 
  scale_fill_manual(values=c("blue", "red", "orange")) + 
  scale_color_manual(values=c("blue", "red", "orange")) + 
  ggtitle("Britain") + 
  geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)

vs.can / vs.usa / vs.gbr

scale = 1.25
# ggsave("Figure 4_Expected Shift in Party Vote Share, Lowest Density to Highest Density District.pdf", 
#        height= 6 / scale, width= 4.25 / scale, units = "in", bg = "white", scale = scale)
ggsave("Figure 4_Expected Shift in Party Vote Share, Lowest Density to Highest Density District.png", 
       height= 6 / scale, width= 4.25 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

# Figure 5: Expected Shift in Party Vote Share, USA by Region ----

reg %>% filter(orig_party=="Republican" | orig_party=="Democrat") %>%
  filter(case=="USA") %>% 
  mutate(party = car::recode(orig_party, "'Republican' = 'Rep'; 'Democrat' = 'Dem'")) %>%
  mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
  ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
  geom_hline(yintercept=0, linetype="dotted") + facet_wrap(~region, ncol=1) + 
  geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
  scale_x_continuous(limits=c(1944,2030), breaks=seq(1944,2020,4)) + 
  scale_y_continuous(labels=scales::percent, limits=c(-0.75, 0.75), breaks=seq(-0.75,0.75,0.25)) + 
  theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
                          axis.text.x = element_text(angle=90, size=7),
                          axis.text.y = element_text(size=7), 
                          panel.grid.major.y = element_blank(),
                          panel.grid.minor.y = element_blank(),
                          panel.grid.minor.x = element_blank(),
                          title=element_text(size=8),
                          legend.position = "none") +
  xlab("") + ylab("") + 
  scale_fill_manual(values=c("blue", "red", "orange")) + 
  scale_color_manual(values=c("blue", "red", "orange")) + 
  geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)

scale = 1.25
# ggsave("Figure 5_Expected Shift in Party Vote Share, USA by Region.pdf",
#        height= 6 / scale, width= 4.25 / scale, units = "in", bg = "white", scale = scale)
ggsave("Figure 5_Expected Shift in Party Vote Share, USA by Region.png",
       height= 6 / scale, width= 4.25 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

# Figure C1: Expected Shift in Party Vote Share, Great Britain by Region ----

reg %>% filter(orig_party=="labour" | orig_party=="liberal" | orig_party=="conservative") %>%
  filter(case=="GBR") %>% mutate(year = round(year, digits=0)) %>%
  mutate(party = car::recode(orig_party, "'liberal' = 'Lib'; 'conservative' = 'Con'; 'labour' = 'Lab'")) %>%
  mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
  ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
  geom_hline(yintercept=0, linetype="dotted") + 
  facet_wrap(~region, ncol=3) + 
  geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
  scale_x_continuous(limits=c(1925,2035), breaks=seq(1930, 2020, 10)) + 
  scale_y_continuous(labels=scales::percent, limits=c(-0.75, 0.75), breaks=seq(-0.75,0.75,0.25)) + 
  theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
                          axis.text.x = element_text(angle=90, size=7),
                          axis.text.y = element_text(size=7), 
                          panel.grid.major.y = element_blank(),
                          panel.grid.minor.y = element_blank(),
                          panel.grid.minor.x = element_blank(),
                          title=element_text(size=8),
                          legend.position = "none") +
  xlab("") + ylab("") + 
  scale_fill_manual(values=c("blue", "red", "orange")) + 
  scale_color_manual(values=c("blue", "red", "orange")) + 
  geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)

scale = 1
# ggsave("Figure C1_Expected Shift in Party Vote Share, Great Britain by Region.pdf", 
#        height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale)
ggsave("Figure C1_Expected Shift in Party Vote Share, Great Britain by Region.png", 
       height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale, dpi = 600)

# Diagnostic plot not included in paper: Expected Shift in Party Vote Share, Canada by Region ----
# reg %>% filter(orig_party=="CCF_NDP" | orig_party=="Conservative" | orig_party=="Liberal") %>%
#   filter(case=="CAN") %>% 
#   mutate(party = car::recode(orig_party, "'Conservative' = 'Con'; 'CCF_NDP' = 'NDP'; 'Liberal' = 'Lib'")) %>%
#   mutate(label = if_else(year == max(year), as.character(party), NA_character_)) %>%
#   ggplot(., aes(x=year, y=mean1, ymin=lower1, ymax=upper1, group=party, color=party, fill=party)) + 
#   geom_hline(yintercept=0, linetype="dotted") + facet_wrap(~region, ncol=1) + 
#   geom_ribbon(alpha=0.2, color=NA) + geom_line() + 
#   scale_x_continuous(limits=c(1920,2025), breaks=can.elec) + 
#   scale_y_continuous(labels=scales::percent, limits=c(-0.75, 0.75), breaks=seq(-0.75,0.75,0.25)) + 
#   theme_minimal() + theme(panel.border = element_rect(fill=NA, size=0.5),
#                           axis.text.x = element_text(angle=90, size=7),
#                           axis.text.y = element_text(size=7), 
#                           panel.grid.major.y = element_blank(),
#                           panel.grid.minor.y = element_blank(),
#                           panel.grid.minor.x = element_blank(),
#                           title=element_text(size=8),
#                           legend.position = "none") +
#   xlab("") + ylab("") + 
#   scale_fill_manual(values=c("blue", "red", "orange")) + 
#   scale_color_manual(values=c("blue", "red", "orange")) + 
#   geom_text(aes(label = label), na.rm = TRUE, hjust=-.1)
# 
# scale = 1.25
# # ggsave("Figure X_Expected Shift in Party Vote Share, Canada by Region.pdf", 
# #        height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale)
# ggsave("Figure X_Expected Shift in Party Vote Share, Canada by Region.png", 
#        height= 8 / scale, width= 6.5 / scale, units = "in", bg = "white", scale = scale, dpi = 600)


# close log file
closeAllConnections()
